Introduction to PyMC

IS 5150/6110

Probabilistic Inference

\[ \LARGE{p(\theta | X) \propto p(X | \theta) \ p(\theta)} \]

Probabilistic Programming

Probabilistic programming languages (PPLs) let you specify probability distributions directly and come with built-in algorithms for inference. They remove much of the computational burden of Bayesian inference.

  1. You specify your model in terms of probability functions.
  2. The PPL automatically runs a sampler (or approximation) for your model.

Most samplers are some form of Markov chain Monte Carlo—with Hamiltonian Monte Carlo currently best-in-class. We’ll be using PyMC.

import numpy as np
import polars as pl
import pymc as pm
import arviz as az

# Fix the RNG state so the simulated data are reproducible.
np.random.seed(42)

# True parameter values for the simulated linear regression.
beta0 = 3   # intercept
beta1 = 7   # slope
sigma = 3   # noise standard deviation
n = 100     # sample size

# Draw the predictor, then add Gaussian noise around the regression line.
x = np.random.uniform(0, 7, size = n)
noise = sigma * np.random.normal(size = n)
y = beta0 + beta1 * x + noise

# Specify a simple linear-regression model; the context manager
# registers each random variable with the Model object.
with pm.Model() as basic_model:
    # Priors: vague Normal on the coefficients, HalfNormal on the noise scale.
    beta = pm.Normal('beta', mu = 0, sigma = 10, shape = 2)
    sigma = pm.HalfNormal('sigma', sigma = 1)

    # Likelihood: Gaussian outcome centered on the linear predictor.
    mu = beta[0] + beta[1] * x
    y_obs = pm.Normal('y_obs', mu = mu, sigma = sigma, observed = y)

# Run the sampler; the result is an ArviZ InferenceData object.
with basic_model:
    # Defaults: 1000 posterior draws per chain after tuning.
    idata = pm.sample()

# Parameter recovery check: the posterior summaries should bracket the
# true values used to simulate the data.
az.summary(idata, round_to = 2)

# Trace plots of the marginal posteriors, with chains overlaid.
az.plot_trace(idata, combined = True)

# Read the (standardized) fox data.
foxes = pl.read_csv('../data/foxes.csv')

# Design matrix of predictors and a 1-D outcome vector.
X = foxes.select(pl.col('avgfood'), pl.col('groupsize')).to_numpy()
y = foxes.get_column('weight').to_numpy()

y
array([ 4.14134697e-01, -1.42704642e+00,  6.75954030e-01,  1.30094212e+00,
        1.11513485e+00, -1.08076924e+00,  2.91233964e-04, -3.71323304e-01,
        1.35161683e+00,  8.95544439e-01,  1.09824328e+00, -5.06455863e-01,
       -1.60178680e-01,  1.31783369e+00,  1.95971334e+00,  1.28405055e+00,
       -1.11455238e+00, -6.83817347e-01, -5.14901648e-01,  9.96893858e-01,
        1.19959270e+00,  1.10086438e-01,  1.42762889e+00,  1.83302657e+00,
        1.42762889e+00, -2.20405864e+00, -1.85516035e-01, -9.28745111e-01,
        1.21648427e+00,  6.50616675e-01, -5.48684788e-01, -1.13988973e+00,
       -1.47772113e+00, -1.30035965e+00, -8.02058336e-01,  1.96815913e+00,
       -2.44636530e-01, -9.03407756e-01, -2.86865454e-01, -1.84088989e+00,
        1.60761148e-01, -9.26124005e-02,  2.53664782e-01,  1.25871319e+00,
        1.09824328e+00,  4.14134697e-01,  7.09737170e-01, -2.12804657e+00,
       -2.36190745e-01, -1.26395540e-01,  2.62110567e-01, -1.13988973e+00,
        8.47490835e-02, -5.65576358e-01,  2.45218998e-01,  1.69789401e+00,
       -1.04698610e+00, -4.55781153e-01, -5.82467928e-01, -4.64226938e-01,
       -7.57208306e-02,  1.77652718e-01, -7.00708917e-01,  1.86098503e-01,
        5.74604611e-01,  1.05601435e+00, -2.50461209e-02, -1.05543188e+00,
       -7.17600487e-01, -6.83817347e-01, -2.02407605e-01, -8.02058336e-01,
       -6.58479992e-01, -1.57062477e+00,  7.60411880e-01,  1.50364096e+00,
        1.06446014e+00,  9.31948684e-02,  6.84399815e-01,  2.55091829e+00,
        2.02990073e-01, -8.86516186e-01, -1.68624465e-01,  4.22580482e-01,
       -3.71323304e-01, -3.54431734e-01,  7.94195019e-01,  1.45296625e+00,
        1.86098503e-01,  6.67508245e-01,  7.26628740e-01, -5.82467928e-01,
       -1.16522709e+00,  5.40821471e-01, -9.26124005e-02,  1.01640653e-01,
       -6.16251067e-01, -9.45636681e-01, -3.45985949e-01, -1.05543188e+00,
       -5.06455863e-01, -1.32569700e+00, -1.67197419e+00,  3.97243127e-01,
       -1.43287110e-01,  2.17085797e+00,  1.10668906e+00,  9.71556503e-01,
       -1.86622724e+00,  3.97243127e-01, -6.07805283e-01,  2.36773213e-01,
       -4.98010078e-01, -1.15678130e+00, -1.47772113e+00, -5.65576358e-01])

# Estimate the direct causal effect of avgfood on weight
# (groupsize is included to block the mediated path).
with pm.Model() as foxes_model:
    # Register predictors and outcome as named data containers.
    X_data = pm.Data('X_data', X)
    y_data = pm.Data('y_data', y)

    # Weakly informative priors on the standardized scale.
    alpha = pm.Normal('alpha', mu = 0, sigma = 0.2)
    beta = pm.Normal('beta', mu = 0, sigma = 0.5, shape = 2)
    sigma = pm.Exponential('sigma', lam = 1)

    # Likelihood: Gaussian outcome with a matrix-form linear predictor.
    mu = alpha + X_data @ beta
    y_obs = pm.Normal('y_obs', mu = mu, sigma = sigma, observed = y_data)

# Draw posterior samples for the fox model.
with foxes_model:
    draws = pm.sample()

# Forest plot of the coefficient posteriors with 95% HDIs.
az.plot_forest(draws, var_names = ['beta'], combined = True, hdi_prob = 0.95)

# Prior predictive check: simulate outcomes from the priors alone and
# compare their distribution against the observed data.
with foxes_model:
    prior_draws = pm.sample_prior_predictive()

az.plot_dist(prior_draws.prior_predictive['y_obs'], label = 'prior predictive')
az.plot_dist(y, color = 'C1', label = 'observed')

# Posterior predictive check: simulate outcomes from the fitted model
# and compare their distribution against the observed data.
with foxes_model:
    posterior_draws = pm.sample_posterior_predictive(draws)

az.plot_dist(posterior_draws.posterior_predictive['y_obs'], label = 'posterior predictive')
az.plot_dist(y, color = 'C1', label = 'observed')

  • PyMC has a lot of options, including a suite of distributions for priors and likelihood functions as well as ways to estimate Bayesian models.
  • ArviZ has many different visualization options for distributions, diagnostics, and prior and posterior predictive checks.
  • PyMC has a lot of flexibility, which is powerful but might also be scary. If you want a higher-level interface that behaves more like scikit-learn, look at Bambi. It uses PyMC and ArviZ as well but in the background.